Pré-requisitos

Pacotes necessários:

# Install missing GitHub-only dependencies (PSF/timetk installs kept for
# reference; they are currently unused).
#if(!require(PSF)) install.packages("PSF")
#if(!require(timetk)) remotes::install_github("business-science/timetk")
# Use requireNamespace() for the availability check: require() attaches the
# package as a side effect and silently returns FALSE on failure. EflowStats
# is attached later via easypackages::libraries(pacotes).
if (!requireNamespace("EflowStats", quietly = TRUE)) {
  remotes::install_github("USGS-R/EflowStats")
}


# Packages required by this analysis. Commented entries were used in earlier
# iterations (time-series forecasting) and are kept for reference.
pacotes <- c(
  "here",
  "usethis",
  "data.table",
  "HEobs",
#  "PSF",
  "tidyverse",
  "lubridate",
  "fs",
  "checkmate",
#  "xts",
#  "hydroGOF",
#  "ModelMetrics",
#  "forecast",
#  "timetk",
  "EflowStats",
  "NbClust",
  "cluster",  
  "cluster.datasets", 
  "cowplot", 
  "clValid",
  "ggfortify", 
  "clustree",
  "dendextend",
  "factoextra",
  "FactoMineR",
  "corrplot",
  "GGally",
  #"ggiraphExtra",
  "kableExtra",
  "tidymodels",
  "bestNormalize",
  "ggradar",
  "ggpubr"

)
# Load (attach) all packages at once
easypackages::libraries(pacotes)

Scripts:

# Project-local helper scripts (data loading and utility functions)
source(here('R', 'load-data.R'))
source(here('R', 'utils.R'))

Dados

seven_stats_tidy <- readRDS(here("output", "seven_stats.RDS"))

Exploração e visualização das assinaturas hidrológicas

# Drop the station identifier so only the numeric signatures remain
hydrosigns <- select(seven_stats_tidy, -code_stn)

# Descriptive statistics table, with skewness/kurtosis moved to the front
psych::describe(hydrosigns) %>% 
  relocate(skew, kurtosis) %>%
  kable() #%>%   kable_styling()
skew kurtosis vars n mean sd median trimmed mad min max range se
lam1 4.2808943 21.0038092 1 87 1187.7445977 2726.5756901 290.760 534.8943662 328.9444620 11.650 18863.140 18851.490 292.3195975
tau2 -0.3290388 0.6411567 2 87 0.3990805 0.1055289 0.400 0.4023944 0.1037820 0.110 0.650 0.540 0.0113139
tau3 -0.1447398 0.9916048 3 87 0.3731034 0.0880448 0.370 0.3749296 0.0741300 0.120 0.640 0.520 0.0094394
tau4 -0.0919163 0.7960255 4 87 0.2167816 0.0796868 0.220 0.2180282 0.0741300 -0.010 0.470 0.480 0.0085433
ar1 -2.6218779 7.4538085 5 87 0.9288391 0.0785307 0.956 0.9457746 0.0385476 0.581 0.992 0.411 0.0084194
amplitude -0.4038625 -0.9805854 6 87 0.7027586 0.3087472 0.770 0.7143662 0.2816940 0.140 1.300 1.160 0.0331012
phase 1.3537602 -0.0270891 7 87 89.4883333 82.8469966 50.494 77.7218310 12.9935064 19.029 281.270 262.241 8.8821304

Distribuição das assinaturas

Dados brutos

# Histograms of the raw (unscaled) signatures, one facet per signature;
# free x scales because magnitudes differ by orders of magnitude (see lam1)
hydrosigns %>% 
  pivot_longer(cols = everything(), names_to = "stats", values_to = "valor") %>%
  ggplot(aes(x=valor)) +
  geom_histogram(fill = "lightblue2", color = "black") + 
  facet_wrap(~stats, scales = "free_x") +
  labs(x = "Value", y = "Frequency") +
  theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Dados normalizados

# Rescale every signature to [-1, 1] so variables with very different
# magnitudes (e.g. lam1 vs tau2) contribute comparably to the distance-based
# clustering below.
hydrosigns_scaled <- hydrosigns %>%
  # alternative: standardize by mean and standard deviation
  #scale() %>% 
  # mutate_all() is superseded in dplyr; across() is the current idiom
  mutate(across(everything(), ~ scales::rescale(.x, to = c(-1, 1)))) %>%
  as_tibble() 

# Long format (one row per station x signature) for faceted/overlaid plots
hydrosigns_scaled_long <- hydrosigns_scaled %>%
  pivot_longer(cols = everything(), names_to = "stats", values_to = "valor") 

# Alternative: faceted histograms of the scaled data (kept for reference)
# hydrosigns_scaled_long %>%
#   ggplot(aes(x=valor)) +
#   geom_histogram(fill = "lightblue2", color = "black") + 
#   facet_wrap(~stats) +
#   labs(x = "Value", y = "Frequency") +
#   theme_bw()

# Overlaid density curves of all scaled signatures, with mean markers and rugs
ggdensity(hydrosigns_scaled_long,
          x = "valor",
          add = "mean", 
          rug = TRUE,
          color = "stats", 
          fill = "stats", alpha = 0.3
          )

# Alternative boxplot via plain ggplot (kept for reference)
# hydrosigns_scaled %>%
#   pivot_longer(cols = everything(), names_to = "stats", values_to = "valor") %>%
#   ggplot(aes(x=valor, y = stats)) +
#   geom_boxplot(fill = "lightblue2", color = "black") + 
#   #facet_wrap(~stats, ncol = 2) +
#   #labs(x = "Value", y = "Frequency") +
#   theme_bw()

# Boxplots with jittered points per signature.
# NOTE(review): mapping shape to 7 levels exceeds ggplot's 6-shape palette,
# producing the warning below and dropping points of the 7th signature.
hydrosigns_scaled_long %>%
  ggboxplot(x = "stats",
            y = "valor",
            color = "stats", 
            #palette =c("#00AFBB", "#E7B800", "#FC4E07"),
            add = "jitter", 
            shape = "stats", 
            size = 0.5 
            ) #+  coord_flip()
## Warning: The shape palette can deal with a maximum of 6 discrete values because
## more than 6 becomes difficult to discriminate; you have 7. Consider
## specifying shapes manually if you must have them.
## Warning: Removed 87 rows containing missing values (geom_point).

Distribuição das assinaturas após normalização dos dados.

Relação entre as assinaturas

data_plot <- hydrosigns_scaled

# Pearson correlation matrix of the scaled signatures
M <- cor(data_plot)

# Upper-triangle correlogram (ellipse glyphs encode sign/strength)
M %>%
  corrplot(type = "upper",
           method = "ellipse", 
           tl.cex = 0.9
           )

# Pairwise significance tests for the correlations
alpha <- 0.05
res <- corrplot::cor.mtest(
   data_plot,
   conf.level = 1-alpha
   )
 
 # Correlogram annotated with significance stars at the 0.001/0.01/alpha levels
 corrplot::corrplot(M,
                    p.mat = res$p,
                    method = "color", 
                    type = "upper",
                    sig.level = c(.001, 0.01, alpha),
                    pch.cex = 1.2,
                    insig = "label_sig",
                    pch.col = "green"
                    #order = "AOE"
                    )

# Full scatterplot/correlation/density matrix
ggpairs(data_plot) + theme_bw()

Redução das dimensões

data_pca <- hydrosigns_scaled
# Principal component analysis of the scaled signatures
res.pca <- PCA(data_pca,  graph = FALSE)# Visualize eigenvalues/variances
fviz_screeplot(res.pca, addlabels = TRUE)

var <- get_pca_var(res.pca)# Contributions of variables to PC1
c_pc1 <- fviz_contrib(res.pca, choice = "var", axes = 1, top = 10)# Contributions of variables to PC2
c_pc2 <- fviz_contrib(res.pca, choice = "var", axes = 2, top = 10)# Control variable colors using their contributions to the principal axes

# Variable correlation circle, colored by contribution
vec_pcs <- fviz_pca_var(res.pca, col.var="contrib",
             gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
             repel = TRUE # Avoid text overlapping
             ) + 
  theme_minimal() + 
  ggtitle("Variables - PCA")

# Combine contribution bar charts and the correlation circle in one figure
cowplot::plot_grid(c_pc1, c_pc2, vec_pcs, align = "v")
## Warning: Graphs cannot be vertically aligned unless the axis parameter is set.
## Placing graphs unaligned.

Análise de agrupamento

K-means

Testando diferentes escolhas para o número de clusters.

data_km <- hydrosigns_scaled
# Fit k-means for k = 1..20 and store tidied/glanced/augmented results per k.
# NOTE(review): kmeans is not seeded here (and nstart defaults to 1), so the
# partitions can vary between runs -- confirm whether reproducibility matters.
kclusts <- 
  tibble(k = 1:20) %>%
  mutate(
    #kclust = map(k, ~kmeans(hydrosigns_trans_wide, .x)),
    kclust = map(k, ~kmeans(data_km, .x)),
    tidied = map(kclust, tidy),
    glanced = map(kclust, glance),
    #augmented = map(kclust, augment, hydrosigns_trans_wide)
    augmented = map(kclust, augment, data_km)
  )
kclusts
## # A tibble: 20 × 5
##        k kclust   tidied             glanced          augmented        
##    <int> <list>   <list>             <list>           <list>           
##  1     1 <kmeans> <tibble [1 × 10]>  <tibble [1 × 4]> <tibble [87 × 8]>
##  2     2 <kmeans> <tibble [2 × 10]>  <tibble [1 × 4]> <tibble [87 × 8]>
##  3     3 <kmeans> <tibble [3 × 10]>  <tibble [1 × 4]> <tibble [87 × 8]>
##  4     4 <kmeans> <tibble [4 × 10]>  <tibble [1 × 4]> <tibble [87 × 8]>
##  5     5 <kmeans> <tibble [5 × 10]>  <tibble [1 × 4]> <tibble [87 × 8]>
##  6     6 <kmeans> <tibble [6 × 10]>  <tibble [1 × 4]> <tibble [87 × 8]>
##  7     7 <kmeans> <tibble [7 × 10]>  <tibble [1 × 4]> <tibble [87 × 8]>
##  8     8 <kmeans> <tibble [8 × 10]>  <tibble [1 × 4]> <tibble [87 × 8]>
##  9     9 <kmeans> <tibble [9 × 10]>  <tibble [1 × 4]> <tibble [87 × 8]>
## 10    10 <kmeans> <tibble [10 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 11    11 <kmeans> <tibble [11 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 12    12 <kmeans> <tibble [12 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 13    13 <kmeans> <tibble [13 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 14    14 <kmeans> <tibble [14 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 15    15 <kmeans> <tibble [15 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 16    16 <kmeans> <tibble [16 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 17    17 <kmeans> <tibble [17 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 18    18 <kmeans> <tibble [18 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 19    19 <kmeans> <tibble [19 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
## 20    20 <kmeans> <tibble [20 × 10]> <tibble [1 × 4]> <tibble [87 × 8]>
# Per-cluster centers (one row per cluster per k)
clusters <- 
  kclusts %>%
  unnest(cols = c(tidied))

# Original observations with their assigned cluster (one row per station per k)
assignments <- 
  kclusts %>% 
  unnest(cols = c(augmented))

# Model-level fit statistics (tot.withinss etc.) per k
clusterings <- 
  kclusts %>%
  unnest(cols = c(glanced))

# Cluster scatter plots (PCA projection) for every fitted k
plts_klust2 <-
  map(kclusts$k,
    function(ik) {
      # ik = 3
      fviz_cluster(kclusts[["kclust"]][[ik]],
                   data = data_km, 
                   show.clust.cent = TRUE,
                   ellipse.type = "convex", 
                   pointsize = 0.8) + 
       theme_bw() + 
        ggtitle(paste0("k = ", ik)) +
        coord_equal()
    }
  )

# Show the first 10 cluster plots side by side
cowplot::plot_grid(plotlist = plts_klust2[1:10], ncol = 3)

# Elbow-style curve: total within-cluster SS versus k
ggplot(clusterings, aes(k, tot.withinss)) +
  geom_line() +
  geom_point()

# Earlier wide-format reshape (kept for reference)
# seven_stats_wide <- seven_stats_tidy %>%
#   pivot_wider(names_from = code_stn, values_from = statistic)
# seven_stats_wide

# NbClust over a precomputed euclidean dissimilarity matrix, Ward linkage,
# evaluating all 30 indices for 2..24 clusters
diss_matrix<- dist(data_km, method = "euclidean", diag=FALSE)
clust_ana <- NbClust(data_km, 
        diss=diss_matrix,
        distance = NULL, 
        min.nc=2, 
        max.nc=24, 
        method = "ward.D2",
        index = "all")
## Warning in pf(beale, pp, df2): NaNs produced

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 8 proposed 2 as the best number of clusters 
## * 4 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 2 proposed 6 as the best number of clusters 
## * 1 proposed 8 as the best number of clusters 
## * 1 proposed 11 as the best number of clusters 
## * 1 proposed 12 as the best number of clusters 
## * 5 proposed 24 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
clust_ana
## $All.index
##        KL      CH Hartigan     CCC     Scott    Marriot   TrCovW  TraceW
## 2  4.6252 76.2062  23.4752 13.9651  640.7244 39166.7785 157.0478 58.4948
## 3  1.2205 59.6538  18.6877 13.2463  702.9625 43093.7989  73.0601 45.8359
## 4  1.3201 54.1931  14.7994 14.5696  768.1084 36231.9002  37.9928 37.4944
## 5  0.7974 50.9703  17.5610 15.7968  864.8869 18612.4368  31.6149 31.8206
## 6  1.2896 52.3744  15.0022 16.7661  945.6863 10588.2444  23.6509 26.2080
## 7  1.3599 53.5598  12.2066 17.5985 1015.9349  6427.5342  16.8239 22.1125
## 8  1.1872 53.9738  11.0003 18.1495 1112.7651  2758.4339  13.5997 19.1852
## 9  1.1877 54.4798   9.8985 18.6662 1198.6059  1301.5464  10.8300 16.8402
## 10 0.9495 54.9581  10.7674 19.1241 1236.8015  1035.8729   7.5015 14.9438
## 11 1.2626 56.7094   9.2906 19.8719 1363.5717   291.9232   6.1630 13.1105
## 12 1.4273 57.9284   7.1872 22.1052 1398.7648   231.8279   4.3538 11.6824
## 13 0.9686 58.0048   7.5670 22.4597 1447.3502   155.6529   3.4536 10.6608
## 14 0.9893 58.7947   7.9289 22.9672 1501.0601    97.3671   2.7434  9.6718
## 15 0.9924 60.2544   8.3852 23.6129 1545.6020    66.9870   2.0474  8.7242
## 16 1.1916 62.4661   7.5747 24.4094 1594.1140    43.6397   1.4439  7.8141
## 17 1.0890 64.3637   7.3831 25.0737 1651.8198    25.3792   1.2290  7.0609
## 18 1.0070 66.4382   7.7483 25.7374 1683.4700    19.7757   0.9769  6.3872
## 19 1.2162 69.2061   6.8667 26.5210 1736.1500    12.0260   0.8077  5.7424
## 20 1.2362 71.4788   5.9547 27.1421 1771.3019     8.8961   0.7057  5.2157
## 21 0.9972 73.1297   6.2288 27.5880 1809.3598     6.3328   0.5622  4.7900
## 22 1.1156 75.3576   5.9111 28.1286 1850.7356     4.3198   0.4517  4.3769
## 23 1.2169 77.5311   5.1697 28.6245 1883.8179     3.2280   0.3892  4.0120
## 24 0.9883 79.1194   5.4271 28.9722 1960.7576     1.4515   0.3401  3.7122
##    Friedman   Rubin Cindex     DB Silhouette    Duda Pseudot2   Beale Ratkowsky
## 2  125.0244  4.0820 0.2664 0.6980     0.5367  0.7695  20.0696  1.3476    0.3209
## 3  137.3336  5.2094 0.3103 1.2980     0.3891  0.7132  20.5073  1.8007    0.3609
## 4  145.2495  6.3684 0.2539 1.3150     0.4033  0.1141  31.0685 28.3713    0.3617
## 5  157.7350  7.5039 0.2510 1.0022     0.4172  0.5740  10.3916  3.1632    0.3449
## 6  169.7721  9.1109 0.3131 0.8616     0.4268  0.7144  17.9893  1.7856    0.3402
## 7  176.3796 10.7983 0.2607 1.0583     0.3702  0.6105  10.8444  2.7508    0.3216
## 8  188.6396 12.4460 0.2432 0.9411     0.3867  1.5321  -2.0837 -1.3591    0.3072
## 9  220.9753 14.1790 0.3307 0.8789     0.3996  0.4696  18.0703  4.8534    0.2981
## 10 227.9138 15.9784 0.3153 0.8506     0.3585  0.3532   9.1579  6.9691    0.2871
## 11 329.7181 18.2127 0.3413 0.7717     0.3853  0.6781   7.1221  2.0324    0.2784
## 12 340.1906 20.4391 0.3433 0.8237     0.3867  0.6256  15.5612  2.6315    0.2694
## 13 375.6197 22.3978 0.3291 0.8613     0.3357  2.7258  -3.7988 -2.4779    0.2607
## 14 411.5347 24.6882 0.3643 0.8187     0.3517  0.3523  22.0604  7.7482    0.2529
## 15 429.3013 27.3697 0.3440 0.8229     0.3544  0.4493  11.0328  5.0375    0.2459
## 16 445.1417 30.5572 0.3255 0.7878     0.3745  0.4442   5.0049  4.5704    0.2388
## 17 486.0598 33.8172 0.3408 0.7580     0.3729  0.4844   5.3211  4.0493    0.2328
## 18 494.9763 37.3840 0.3282 0.7573     0.3757  2.8525  -1.9483 -2.2239    0.2270
## 19 534.1225 41.5819 0.3391 0.7173     0.3934  0.4632  16.2232  4.9383    0.2219
## 20 543.3145 45.7809 0.3204 0.7358     0.3838 11.9574  -0.9164 -2.0920    0.2170
## 21 560.5969 49.8497 0.3166 0.7040     0.3986  0.3742   5.0169  5.7267    0.2123
## 22 571.5709 54.5543 0.3082 0.6954     0.4072 18.0345   0.0000  0.0000    0.2080
## 23 582.9552 59.5155 0.3060 0.6268     0.4278 16.1311  -1.8760 -2.8552    0.2038
## 24 831.7566 64.3229 0.3019 0.5957     0.4414  0.6007   4.6535  2.6559    0.2000
##       Ball Ptbiserial    Frey McClain   Dunn Hubert SDindex Dindex   SDbw
## 2  29.2474     0.6820  1.6224  0.2496 0.4137 0.0197  3.9854 0.6710 0.3764
## 3  15.2786     0.6439  0.1317  0.6131 0.1126 0.0214  5.6466 0.6141 0.7463
## 4   9.3736     0.7155 -0.0760  0.6972 0.1126 0.0236  6.7988 0.5647 0.7190
## 5   6.3641     0.7204  0.2854  0.6962 0.1126 0.0235  3.9294 0.5170 0.2821
## 6   4.3680     0.7290  1.5213  0.7177 0.1476 0.0241  3.6715 0.4808 0.2681
## 7   3.1589     0.5853  0.0756  1.3696 0.1343 0.0249  4.8091 0.4390 0.2509
## 8   2.3981     0.5942 -0.0170  1.3571 0.1343 0.0252  4.6210 0.4129 0.2167
## 9   1.8711     0.5983  0.6322  1.3441 0.1862 0.0261  5.0004 0.3937 0.1591
## 10  1.4944     0.5846  0.1291  1.4301 0.1862 0.0272  4.6845 0.3667 0.1384
## 11  1.1919     0.5859  0.4351  1.4271 0.2059 0.0273  4.4218 0.3432 0.1060
## 12  0.9735     0.5723  3.5190  1.5110 0.2247 0.0276  4.6774 0.3257 0.1127
## 13  0.8201     0.4651  0.0667  2.3743 0.1085 0.0283  6.6233 0.3048 0.1063
## 14  0.6908     0.4664  0.4753  2.3534 0.1235 0.0284  6.4531 0.2928 0.0937
## 15  0.5816     0.4493  0.1897  2.5277 0.1235 0.0292  6.3417 0.2767 0.0869
## 16  0.4884     0.4454  0.1265  2.5465 0.1235 0.0293  6.3666 0.2601 0.0759
## 17  0.4153     0.4446  0.1662  2.5347 0.1336 0.0294  6.4637 0.2511 0.0739
## 18  0.3548     0.4425  0.0910  2.5334 0.1336 0.0296  6.3960 0.2399 0.0680
## 19  0.3022     0.4425  1.0280  2.5160 0.1651 0.0297  6.2660 0.2285 0.0599
## 20  0.2608     0.4010  0.1047  3.0403 0.1651 0.0299  7.6759 0.2136 0.0562
## 21  0.2281     0.4006  0.1772  3.0318 0.1651 0.0301  8.1044 0.2037 0.0479
## 22  0.1989     0.3984  0.0845  3.0367 0.1651 0.0303  8.1269 0.1961 0.0451
## 23  0.1744     0.3983  0.1875  3.0294 0.1857 0.0303  8.0832 0.1863 0.0329
## 24  0.1547     0.3971  0.3873  3.0328 0.2064 0.0304  8.0393 0.1784 0.0285
## 
## $All.CriticalValues
##    CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2          0.7154            26.6504       0.2259
## 3          0.6881            23.1145       0.0860
## 4          0.2524            11.8459       0.0000
## 5          0.5070            13.6158       0.0047
## 6          0.6744            21.7214       0.0895
## 7          0.5401            14.4778       0.0110
## 8          0.3404            11.6263       1.0000
## 9          0.5300            14.1914       0.0001
## 10         0.3011            11.6036       0.0000
## 11         0.5190            13.9039       0.0577
## 12         0.6051            16.9684       0.0130
## 13         0.3404            11.6263       1.0000
## 14         0.4792            13.0421       0.0000
## 15         0.4241            12.2211       0.0001
## 16         0.2524            11.8459       0.0017
## 17         0.3011            11.6036       0.0024
## 18         0.1898            12.8095       1.0000
## 19         0.5070            13.6158       0.0001
## 20        -0.0196           -52.1455       1.0000
## 21         0.1898            12.8095       0.0008
## 22        -0.2283             0.0000          NaN
## 23         0.1049            17.0735       1.0000
## 24         0.3729            11.7706       0.0207
## 
## $Best.nc
##                     KL      CH Hartigan     CCC    Scott  Marriot  TrCovW
## Number_clusters 2.0000 24.0000   3.0000 24.0000  11.0000    5.000  3.0000
## Value_Index     4.6252 79.1194   4.7875 28.9722 126.7702 9595.271 83.9877
##                 TraceW Friedman   Rubin Cindex      DB Silhouette   Duda
## Number_clusters 3.0000  24.0000 12.0000 8.0000 24.0000     2.0000 2.0000
## Value_Index     4.3174 248.8014 -0.2677 0.2432  0.5957     0.5367 0.7695
##                 PseudoT2  Beale Ratkowsky    Ball PtBiserial   Frey McClain
## Number_clusters   2.0000 2.0000    4.0000  3.0000      6.000 2.0000  2.0000
## Value_Index      20.0696 1.3476    0.3617 13.9688      0.729 1.6224  0.2496
##                   Dunn Hubert SDindex Dindex    SDbw
## Number_clusters 2.0000      0  6.0000      0 24.0000
## Value_Index     0.4137      0  3.6715      0  0.0285
## 
## $Best.partition
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 1 1 1 1 1 1 1 1 1
## [39] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [77] 1 1 1 1 1 2 1 1 1 1 1

Nº ótimo de clusters

Método ‘Elbow’

set.seed(31)
# Elbow method: total within-cluster sum of squares versus number of clusters
fviz_nbclust(hydrosigns_scaled,
  kmeans,
  method = "wss",
  k.max = 24
) +
  theme_bw() +
  ggtitle("the Elbow Method")

Estatística GAP

# Gap statistic over k = 1..24 with B = 50 bootstrap reference sets
gap_stat <- clusGap(hydrosigns_scaled, 
                    FUN = kmeans, 
                    nstart = 30, 
                    K.max = 24, 
                    B = 50)
fviz_gap_stat(gap_stat) + 
  theme_bw() + 
  ggtitle("fviz_gap_stat: Gap Statistic")

Método silhueta

# Average silhouette width versus number of clusters
fviz_nbclust(hydrosigns_scaled, 
             kmeans, 
             method = "silhouette", 
             k.max = 24) + 
  theme_minimal() + 
  ggtitle("The Silhouette Plot")

Método das somas quadráticas

# Mean within- and between-cluster sums of squares for each fitted k-means
# model (mapping over the stored model objects directly).
kclusts_ss <- kclusts %>%
  mutate(
    within_ss  = map_dbl(kclust, ~ mean(.x[["withinss"]])),
    between_ss = map_dbl(kclust, ~ mean(.x[["betweenss"]]))
  ) %>%
  select(k, within_ss, between_ss)

# Long format: one row per (k, measure)
kclusts_ss_long <- pivot_longer(
  kclusts_ss,
  cols = -k, names_to = "measure", values_to = "valor"
)

# Compare both SS measures across k on a log10 scale, skipping k = 1
kclusts_ss_long %>%
  filter(k > 1) %>%
  ggplot(aes(x = k, y = log10(valor), fill = measure)) +
  geom_col(position = "dodge") +
  labs(
    title = "Cluster Model Comparison",
    x = "Number of Clusters",
    y = "Log10 Total Sum of Squares"
  )

  #scale_x_discrete(name = "Number of Clusters", limits = c("0", "2", "3", "4", "5", "6", "7", "8"))

NbClust

O pacote NbClust fornece 30 índices para determinar o número relevante de clusters e propõe ao usuário o melhor esquema de agrupamento a partir dos diferentes resultados obtidos pela variação de todas as combinações de número de clusters, medidas de distância e métodos de agrupamento.

# NbClust directly on the scaled data (euclidean distance, Ward linkage),
# extending the search to 25 clusters
nbc_scaled <- NbClust(hydrosigns_scaled, 
                      distance = "euclidean",
                      min.nc = 2, 
                      max.nc = 25, 
                      #method = "complete", 
                      method = "ward.D2",
                      index ="all"
                      )
## Warning in pf(beale, pp, df2): NaNs produced

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 8 proposed 2 as the best number of clusters 
## * 4 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 2 proposed 6 as the best number of clusters 
## * 1 proposed 8 as the best number of clusters 
## * 1 proposed 11 as the best number of clusters 
## * 1 proposed 12 as the best number of clusters 
## * 2 proposed 24 as the best number of clusters 
## * 3 proposed 25 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
# Bar chart of how many indices voted for each number of clusters
factoextra::fviz_nbclust(nbc_scaled) + 
  theme_bw() + 
  ggtitle("NbClust's optimal number of clusters")
## Warning in if (class(best_nc) == "numeric") print(best_nc) else if
## (class(best_nc) == : the condition has length > 1 and only the first element
## will be used
## Warning in if (class(best_nc) == "matrix") .viz_NbClust(x, print.summary, : the
## condition has length > 1 and only the first element will be used
## Warning in if (class(best_nc) == "numeric") print(best_nc) else if
## (class(best_nc) == : the condition has length > 1 and only the first element
## will be used
## Warning in if (class(best_nc) == "matrix") {: the condition has length > 1 and
## only the first element will be used
## Among all indices: 
## ===================
## * 2 proposed  0 as the best number of clusters
## * 8 proposed  2 as the best number of clusters
## * 4 proposed  3 as the best number of clusters
## * 1 proposed  4 as the best number of clusters
## * 1 proposed  5 as the best number of clusters
## * 2 proposed  6 as the best number of clusters
## * 1 proposed  8 as the best number of clusters
## * 1 proposed  11 as the best number of clusters
## * 1 proposed  12 as the best number of clusters
## * 2 proposed  24 as the best number of clusters
## * 3 proposed  25 as the best number of clusters
## 
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is  2 .

# Same analysis with complete linkage and a narrower range (2..10 clusters)
nbc_scaled2 <- NbClust(hydrosigns_scaled,
  distance = "euclidean",
  min.nc = 2,
  max.nc = 10,
  method = "complete",
  index = "all"
)

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 5 proposed 2 as the best number of clusters 
## * 2 proposed 3 as the best number of clusters 
## * 7 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 6 as the best number of clusters 
## * 4 proposed 9 as the best number of clusters 
## * 3 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  4 
##  
##  
## *******************************************************************
# Index-vote summary for the complete-linkage run
factoextra::fviz_nbclust(nbc_scaled2) +
  theme_bw() +
  ggtitle("NbClust's optimal number of clusters")
## Warning in if (class(best_nc) == "numeric") print(best_nc) else if
## (class(best_nc) == : the condition has length > 1 and only the first element
## will be used
## Warning in if (class(best_nc) == "matrix") .viz_NbClust(x, print.summary, : the
## condition has length > 1 and only the first element will be used
## Warning in if (class(best_nc) == "numeric") print(best_nc) else if
## (class(best_nc) == : the condition has length > 1 and only the first element
## will be used
## Warning in if (class(best_nc) == "matrix") {: the condition has length > 1 and
## only the first element will be used
## Among all indices: 
## ===================
## * 2 proposed  0 as the best number of clusters
## * 1 proposed  1 as the best number of clusters
## * 5 proposed  2 as the best number of clusters
## * 2 proposed  3 as the best number of clusters
## * 7 proposed  4 as the best number of clusters
## * 1 proposed  5 as the best number of clusters
## * 1 proposed  6 as the best number of clusters
## * 4 proposed  9 as the best number of clusters
## * 3 proposed  10 as the best number of clusters
## 
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is  4 .

Clustree

# Cluster assignments for k = 1..15, one element per k, for clustree().
# The original `tmp[k] <- kmeans(...)` relied on partial list assignment:
# it kept only the first element of the kmeans object (the cluster vector)
# and emitted "number of items to replace" warnings on every iteration.
# Store the $cluster vector explicitly and preallocate the list instead.
# NOTE(review): kmeans is not seeded here, so assignments vary between runs.
tmp <- vector("list", 15)
for (k in seq_len(15)) {
  tmp[[k]] <- kmeans(hydrosigns_scaled, k, nstart = 30)$cluster
}
## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length

## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length

## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length

## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length

## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length

## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length

## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length

## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length

## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length

## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length

## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length

## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length

## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length

## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length

## Warning in tmp[k] <- kmeans(hydrosigns_scaled, k, nstart = 30): number of items
## to replace is not a multiple of replacement length
# One column per k ("k1".."k15") holding that model's cluster assignments
df <- data.frame(tmp)
# Name the columns in a single step; the original assigned numeric names
# first (via the `seq(1:15)` anti-pattern) and then pasted the prefix.
colnames(df) <- paste0("k", seq_len(ncol(df)))
# PCA of the assignment matrix; keep the first two coordinates alongside the
# cluster columns (clustree() itself only reads the "k*" columns).
df.pca <- prcomp(df, center = TRUE, scale. = FALSE)
ind.coord <- df.pca$x[, 1:2]
df <- bind_cols(as.data.frame(df), as.data.frame(ind.coord))
clustree(df, prefix = "k")
## Warning: The `add` argument of `group_by()` is deprecated as of dplyr 1.0.0.
## Please use the `.add` argument instead.

Escolha do algoritmo apropriado

# Internal validation (connectivity, Dunn, silhouette) across three algorithms
# and 2..24 clusters; station codes become row names for labeled output
tmp <- as.data.frame(hydrosigns_scaled) 
rownames(tmp) <- seven_stats_tidy$code_stn
intern <- clValid(tmp, 
                  nClust = 2:24, 
                  clMethods = c("hierarchical","kmeans","pam"), 
                  validation = "internal")# Summary
# NOTE(review): summary() here prints to the console; piping its return value
# into kable() may not produce the intended table -- verify the rendered output.
summary(intern) %>% kable() %>% kable_styling()
## 
## Clustering Methods:
##  hierarchical kmeans pam 
## 
## Cluster sizes:
##  2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 
## 
## Validation Measures:
##                                   2        3        4        5        6        7        8        9       10       11       12       13       14       15       16       17       18       19       20       21       22       23       24
##                                                                                                                                                                                                                                          
## hierarchical Connectivity    2.9290   2.9290   8.0337  13.0651  16.8119  19.6298  23.7238  29.2992  36.3036  40.5282  42.6948  47.7702  54.1623  56.6623  58.6956  63.2123  64.9623  67.8913  69.9841  72.2619  78.5302  84.6599  86.9933
##              Dunn            0.4037   0.4037   0.2151   0.2162   0.2362   0.2583   0.2583   0.1679   0.2220   0.2220   0.2220   0.2615   0.2924   0.2952   0.2952   0.2952   0.2952   0.2952   0.2952   0.2952   0.2668   0.3536   0.3536
##              Silhouette      0.4737   0.5167   0.4918   0.4852   0.4867   0.4333   0.3974   0.4213   0.4639   0.4581   0.4512   0.4137   0.3850   0.3724   0.3710   0.3670   0.3596   0.3278   0.3235   0.3190   0.3216   0.3498   0.3485
## kmeans       Connectivity    3.4623  17.8766  23.7595  29.3294  32.7944  36.3377  45.6552  42.2679  43.3929  51.2139  53.3806  58.4560  66.7734  70.5778  72.6111  76.9734  76.2865  85.2619  87.2381  89.5159  95.7841  96.0341  98.3675
##              Dunn            0.2129   0.0299   0.0603   0.0757   0.0815   0.1232   0.0479   0.1544   0.1544   0.1790   0.1790   0.1824   0.1720   0.1720   0.1720   0.1720   0.1985   0.0917   0.0917   0.0917   0.0999   0.1249   0.1249
##              Silhouette      0.5353   0.3302   0.4101   0.4245   0.4339   0.4430   0.4105   0.4626   0.4579   0.4155   0.4133   0.3722   0.3627   0.3806   0.3783   0.3762   0.3710   0.3660   0.3680   0.3672   0.3699   0.3666   0.3653
## pam          Connectivity    3.4623  18.4782  32.4012  34.1742  41.2405  49.7294  54.8048  57.4694  60.4583  62.0250  65.3052  67.5885  70.3266  76.5948  81.3948  85.7286  86.6869  88.8536  90.8869  98.4313 100.9313 103.2091 104.9591
##              Dunn            0.2129   0.0305   0.0484   0.0484   0.0345   0.0702   0.0702   0.0751   0.0751   0.0751   0.0751   0.1047   0.1047   0.1047   0.1047   0.1085   0.1313   0.1313   0.1313   0.1313   0.1355   0.1355   0.1713
##              Silhouette      0.5353   0.3231   0.2811   0.3266   0.4030   0.3208   0.2784   0.2999   0.2927   0.3153   0.3303   0.3406   0.3594   0.3621   0.3642   0.3322   0.3388   0.3449   0.3468   0.3454   0.3357   0.3379   0.3433
## 
## Optimal Scores:
## 
##              Score  Method       Clusters
## Connectivity 2.9290 hierarchical 2       
## Dunn         0.4037 hierarchical 2       
## Silhouette   0.5353 kmeans       2
# Compute dissimilarity matrix with euclidean distances
d <- dist(hydrosigns_scaled, method = "euclidean")# Hierarchical clustering using Ward's method
res.hc <- hclust(d, method = "ward.D2" )# Cut tree into 5 groups
grp <- cutree(res.hc, k = 2)# Visualize
plot(res.hc, cex = 0.6) # plot tree
rect.hclust(res.hc, k = 2, border = 2:5) # add rectangle

res.hc2 <- hclust(d, method = "complete" )# Cut tree into 5 groups
grp <- cutree(res.hc2, k = 3)# Visualize
plot(res.hc2, cex = 0.6) # plot tree
rect.hclust(res.hc2, k = 3, border = 2:6) # add rectangle

# K-means with the NbClust-suggested k = 2 (nstart = 30 restarts)
final2 <- kmeans(hydrosigns_scaled, 2, nstart = 30)
fviz_cluster(final2, data = hydrosigns_scaled) + theme_bw() + ggtitle("k = 2")

# Alternative partition with k = 4 (the best k under complete linkage above)
final4 <- kmeans(hydrosigns_scaled, 4, nstart = 30)
fviz_cluster(final4, data = hydrosigns_scaled) + theme_bw() + ggtitle("k = 4")

# Mean of each raw signature per cluster of the NbClust best partition.
# summarise_all() is superseded in dplyr; across() is the current idiom.
# NOTE(review): code_stn is averaged along with the signatures (as in the
# original output) -- confirm that is intended rather than dropping it.
as.data.frame(seven_stats_tidy) %>%
  mutate(Cluster = nbc_scaled$Best.partition) %>% 
  group_by(Cluster) %>% 
  summarise(across(everything(), mean)) %>%
  kable() 
Cluster code_stn lam1 tau2 tau3 tau4 ar1 amplitude phase
1 176.2609 1380.3913 0.3789855 0.3552174 0.2002899 0.9335507 0.8205797 48.01441
2 119.3889 449.2656 0.4761111 0.4416667 0.2800000 0.9107778 0.2511111 248.47172
# Scaled signatures with the station id re-attached
hydrosigns_df <- as.data.frame(hydrosigns_scaled) %>% 
  mutate(code_stn = seven_stats_tidy$code_stn)

# Cluster membership per station from the NbClust best partition
cluster_pos <- as.data.frame(nbc_scaled$Best.partition) %>% 
  mutate(code_stn = seven_stats_tidy$code_stn)
colnames(cluster_pos) <- c("cluster", "code_stn")

# Explicit join key silences the "Joining, by = ..." message and guards
# against accidental key changes.
hydrosigns_final <- inner_join(cluster_pos, hydrosigns_df, by = "code_stn") %>%
  mutate(cluster = factor(cluster))
## Joining, by = "code_stn"
# Per-cluster mean profile of the signatures for the radar chart below.
# mutate_at()/summarise_all() are superseded in dplyr; use across().
# NOTE(review): because group_by() precedes the rescale, scales::rescale is
# applied WITHIN each cluster group (as in the original) -- confirm that a
# global rescale was not intended instead.
hydrosigns_radar <- hydrosigns_final %>%
  select(-code_stn) %>%
  group_by(cluster) %>%
  mutate(across(everything(), scales::rescale)) %>%
  summarise(across(everything(), mean)) %>%
  ungroup() 

# Radar chart of the per-cluster mean signature profiles
hydrosigns_radar %>%
  ggradar(
    font.radar = "roboto",
    grid.label.size = 10,  # Affects the grid annotations (0%, 50%, etc.)
    axis.label.size = 6.5, # Affects the names of the variables
    group.point.size = 2   # Simply the size of the point 
  )

table(hydrosigns_final$cluster)
## 
##  1  2 
## 69 18
# Pairwise scatter/density matrix of the signatures, colored by cluster
ggpairs(hydrosigns_final, 
        columns = 3:ncol(hydrosigns_final), 
        mapping = aes(color = factor(cluster), alpha = 0.5), 
        diag = list(continuous = wrap("densityDiag")), 
        lower=list(continuous = wrap("points", alpha=0.9)))

# Parallel coordinate plots put each feature on a separate column with lines
# connecting each column, colored by cluster membership
ggparcoord(data = hydrosigns_final,
           columns = 3:9,
           groupColumn = 1, 
           alphaLines = 0.4, 
           title = "Parallel Coordinate Plot for the HydroSigns", 
           scale = "globalminmax", 
           showPoints = TRUE, ) + 
  theme(legend.position = "bottom")

# Boxplots of each signature split by cluster
hydrosigns_final %>%
  pivot_longer(cols = -(cluster:code_stn), 
               names_to = "stats", 
               values_to = "valor") %>%
  ggplot(aes(x=cluster, y = valor)) +
  geom_boxplot(aes(fill = stats), color = "black") + 
  #coord_flip() +
  #facet_wrap(~stats, ncol = 2) +
  #labs(x = "Value", y = "Frequency") +
  theme_bw()

Refs

https://towardsdatascience.com/10-tips-for-choosing-the-optimal-number-of-clusters-277e93d72d92

https://www.tidymodels.org/learn/statistics/k-means/